In this notebook we test several methods of data-window normalization and the effect they have on the predictive behaviour of our model.
import os
import time
import math
from pathlib import Path
from glob import glob
from random import shuffle
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pywt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from data_processor_Stdnorm import DataLoader as DataLoader_Std
from default_model import Model
Define functions for plots
def plot_results_multiple(predicted_data, true_data, prediction_len):
    """Plot the true series with every multi-step prediction overlaid,
    each shifted horizontally to the offset where it begins."""
    figure = plt.figure(facecolor='white')
    axis = figure.add_subplot(111)
    axis.plot(true_data, label='True Data')
    for idx, sequence in enumerate(predicted_data):
        # Leading None entries shift this prediction to its start position.
        offset = [None] * (idx * prediction_len)
        plt.plot(offset + sequence, label='Prediction')
    plt.show()
# Render notebook figures at twice the library-default resolution.
default_dpi = mpl.rcParamsDefault['figure.dpi']
mpl.rcParams['figure.dpi'] = 2 * default_dpi
The different normalization methods are implemented below as DataLoader subclasses, each overriding the window-normalization method for one trial.
def TrainTestPlot(settings, features, normMethod='STD'):
    """Train the model on every CSV under the data directory using the
    requested window-normalization scheme, then plot test predictions.

    Parameters
    ----------
    settings : dict
        Run configuration (sequence_length, batch_size, epochs, saveDir, ...).
    features : list[str]
        Feature column names; the predicted column ('Close') comes first.
    normMethod : str
        Key selecting which DataLoader subclass (normalization scheme) to use.

    Raises
    ------
    ValueError
        If `normMethod` does not name a known normalization scheme.
    """
    if not os.path.exists(settings['saveDir']):
        os.makedirs(settings['saveDir'])

    # Map method keys to the DataLoader variant implementing that scheme.
    # NOTE(fix): the original if/elif chain referenced the undefined name
    # Data_Loader_MinMax3WT (missing underscore) for 'MINMAX3_WT' -> NameError.
    loaders = {
        'STD': DataLoader_Std,
        'MINMAX1': Data_Loader_MinMax1,
        'MINMAX3': Data_Loader_MinMax3,
        'MINMAX1_5': Data_Loader_MinMax1_5,
        'STD_WT': Data_Loader_StdWT,
        'MINMAX1_WT': Data_Loader_MinMaxWT,
        'MINMAX3_WT': Data_Loader_MinMax3_WT,
        'MINMAX1_5_WT': Data_Loader_MinMax1_5_WT,
    }
    key = normMethod.upper()
    if key not in loaders:
        raise ValueError('Unknown normMethod: {}'.format(normMethod))
    loader_cls = loaders[key]

    model = Model()
    model.build_model(settings['input_timesteps'], settings['input_dim'])

    # Forward slashes work on every platform (backslashes are fragile string
    # escapes), and recursive=True is required for '**' to actually descend
    # into subdirectories.
    pathlist = glob('../data/IndicatorsAdded/**/*.csv', recursive=True)
    shuffle(pathlist)
    total = len(pathlist)
    for i, path in enumerate(pathlist, start=1):
        pathStr = str(path)
        print('USING DATA({}/{}): [{}]'.format(i, total, pathStr))
        data = loader_cls(pathStr, settings['train_test_split'], features)

        # Out-of-memory (generator-based) training: batches are produced on
        # the fly, so the full training set is never materialized.  (The
        # original also called get_train_data() here and discarded the
        # result, loading everything into memory for nothing.)
        steps_per_epoch = math.ceil(
            (data.len_train - settings['sequence_length']) / settings['batch_size'])
        model.train_generator(
            data_gen=data.generate_train_batch(
                seq_len=settings['sequence_length'],
                batch_size=settings['batch_size'],
                normalise=settings['normalize']
            ),
            epochs=settings['epochs'],
            batch_size=settings['batch_size'],
            steps_per_epoch=steps_per_epoch,
            save_dir=settings['saveDir'],
            save_name=settings['model_name']
        )

        # NOTE(review): evaluation/plotting runs once per data file, matching
        # the flattened source layout — confirm it was not meant to run only
        # after the whole loop.
        x_test, y_test = data.get_test_data(seq_len=settings['sequence_length'],
                                            normalise=settings['normalize'])
        predictions = model.predict_sequences_multiple(
            x_test, settings['predict_sequence'], settings['predict_sequence'])
        plot_results_multiple(predictions, y_test, settings['predict_sequence'])
class Data_Loader_MinMax1(DataLoader_Std):
    """DataLoader variant that min-max scales each window into [0, 1]."""

    def normalise_windows(self, window_data, single_window=False):
        """Scale every column of every window independently into [0, 1].

        window_data: a 2-D (timesteps, features) array when `single_window`
        is True, otherwise an iterable of such windows.  Returns an ndarray
        of the scaled windows.
        """
        windows = [window_data] if single_window else window_data
        scaler = MinMaxScaler()
        # MinMaxScaler already scales each column independently, so a single
        # 2-D fit_transform per window replaces the original per-column loop
        # with identical results and less Python overhead.
        return np.array([scaler.fit_transform(window) for window in windows])
class Data_Loader_MinMax3(DataLoader_Std):
    """DataLoader variant that min-max scales each window into [0, 3]."""

    def normalise_windows(self, window_data, single_window=False):
        """Scale every column of every window independently into [0, 3].

        window_data: a 2-D (timesteps, features) array when `single_window`
        is True, otherwise an iterable of such windows.  Returns an ndarray
        of the scaled windows.
        """
        windows = [window_data] if single_window else window_data
        scaler = MinMaxScaler((0, 3))
        # MinMaxScaler already scales each column independently, so a single
        # 2-D fit_transform per window replaces the original per-column loop
        # with identical results and less Python overhead.
        return np.array([scaler.fit_transform(window) for window in windows])
class Data_Loader_MinMax1_5(DataLoader_Std):
    """DataLoader variant that min-max scales each window into [1, 5]."""

    def normalise_windows(self, window_data, single_window=False):
        """Scale every column of every window independently into [1, 5].

        window_data: a 2-D (timesteps, features) array when `single_window`
        is True, otherwise an iterable of such windows.  Returns an ndarray
        of the scaled windows.
        """
        windows = [window_data] if single_window else window_data
        scaler = MinMaxScaler((1, 5))
        # MinMaxScaler already scales each column independently, so a single
        # 2-D fit_transform per window replaces the original per-column loop
        # with identical results and less Python overhead.
        return np.array([scaler.fit_transform(window) for window in windows])
class Data_Loader_StdWT(DataLoader_Std):
    """Standard-scales each window column, then replaces it with its Haar
    wavelet approximation (cA) and detail (cD) coefficient columns,
    doubling the number of feature columns."""

    def normalise_windows(self, window_data, single_window=False):
        """Z-score each column, then split it into (cA, cD) via a Haar DWT."""
        windows = [window_data] if single_window else window_data
        scaler = StandardScaler()
        result = []
        for window in windows:
            cols = []
            for col in range(window.shape[1]):
                scaled = scaler.fit_transform(
                    window[:, col].reshape(-1, 1)).reshape(-1)
                # Single-level Haar DWT: approximation, then detail.
                approx, detail = pywt.dwt(scaled, 'haar')
                cols.extend((approx, detail))
            result.append(np.array(cols).T)
        return np.array(result)
class Data_Loader_MinMaxWT(DataLoader_Std):
    """Min-max scales each window column into [0, 1], then replaces it with
    its Haar wavelet approximation (cA) and detail (cD) coefficient columns,
    doubling the number of feature columns."""

    def normalise_windows(self, window_data, single_window=False):
        """Scale each column to [0, 1], then split it into (cA, cD) via DWT."""
        windows = [window_data] if single_window else window_data
        scaler = MinMaxScaler()
        result = []
        for window in windows:
            cols = []
            for col in range(window.shape[1]):
                scaled = scaler.fit_transform(
                    window[:, col].reshape(-1, 1)).reshape(-1)
                # Single-level Haar DWT: approximation, then detail.
                approx, detail = pywt.dwt(scaled, 'haar')
                cols.extend((approx, detail))
            result.append(np.array(cols).T)
        return np.array(result)
class Data_Loader_MinMax3_WT(DataLoader_Std):
    """Min-max scales each window column into [0, 3], then replaces it with
    its Haar wavelet approximation (cA) and detail (cD) coefficient columns,
    doubling the number of feature columns."""

    def normalise_windows(self, window_data, single_window=False):
        """Scale each column to [0, 3], then split it into (cA, cD) via DWT."""
        windows = [window_data] if single_window else window_data
        scaler = MinMaxScaler((0, 3))
        result = []
        for window in windows:
            cols = []
            for col in range(window.shape[1]):
                scaled = scaler.fit_transform(
                    window[:, col].reshape(-1, 1)).reshape(-1)
                # Single-level Haar DWT: approximation, then detail.
                approx, detail = pywt.dwt(scaled, 'haar')
                cols.extend((approx, detail))
            result.append(np.array(cols).T)
        return np.array(result)
class Data_Loader_MinMax1_5_WT(DataLoader_Std):
    """Min-max scales each window column into [1, 5], then replaces it with
    its Haar wavelet approximation (cA) and detail (cD) coefficient columns,
    doubling the number of feature columns."""

    def normalise_windows(self, window_data, single_window=False):
        """Scale each column to [1, 5], then split it into (cA, cD) via DWT."""
        windows = [window_data] if single_window else window_data
        scaler = MinMaxScaler((1, 5))
        result = []
        for window in windows:
            cols = []
            for col in range(window.shape[1]):
                scaled = scaler.fit_transform(
                    window[:, col].reshape(-1, 1)).reshape(-1)
                # Single-level Haar DWT: approximation, then detail.
                approx, detail = pywt.dwt(scaled, 'haar')
                cols.extend((approx, detail))
            result.append(np.array(cols).T)
        return np.array(result)
Preliminary settings and parameter configuration:
# Run configuration for the normalization trials.
config = {
    'data_path': '../data/IndicatorsAdded',  # root folder of the input CSVs
    'model_name': 'Model_NoNorm',            # checkpoint name for this trial
    'saveDir': 'saved_models',               # checkpoint output directory
    'train_test_split': 0.96,                # fraction of rows used for training
    'sequence_length': 50,                   # rows per sliding window
    'epochs': 1,
    'batch_size': 32,
    'normalize': False,                      # baseline: no window normalization
    'input_timesteps': 49,                   # sequence_length - 1
    'input_dim': 16,                         # number of feature columns
    'predict_sequence': 49,                  # steps per multi-step prediction
}
# Feature columns fed to the model during training.
# The prediction target ('Close') must appear first.
columns = [
    "Close", "High", "Low", "Volume",
    "EMA_20", "EMA_12_26_PDIFF", "CCI",
    "MACD", "MACD_SIG", "MACD_HIST",
    "ATR", "BOLBAND_UP", "BOLBAND_MID", "BOLBAND_LOW",
    "MOM20", "ROC10",
]
Here we run partial or full epochs and analyze the resulting prediction plots to decide which scaling method (or combination of methods) to use.
# Baseline run: config['normalize'] is still False, so windows are fed in raw.
TrainTestPlot(config, columns)
# Trial: per-window standard (z-score) normalization.
config['normalize'] = True
config['model_name'] = 'Model_StdNorm'
config['input_dim'] = 16
config['sequence_length'] = 50
TrainTestPlot(config, columns, 'STD')
# Trial: per-window min-max scaling into [0, 1].
config['normalize'] = True
config['model_name'] = 'Model_MinMax1'
config['input_dim'] = 16
config['sequence_length'] = 50
TrainTestPlot(config, columns, 'MINMAX1')
# Trial: per-window min-max scaling into [0, 3].
config['normalize'] = True
config['model_name'] = 'Model_MinMax3'
config['input_dim'] = 16
config['sequence_length'] = 50
TrainTestPlot(config, columns, 'MINMAX3')
# Trial: per-window min-max scaling into [1, 5].
config['normalize'] = True
config['model_name'] = 'Model_MinMax1_5'
config['input_dim'] = 16
config['sequence_length'] = 50
TrainTestPlot(config, columns, 'MINMAX1_5')
The wavelet-transform variants split each feature into approximation (father/scaling wavelet) and detail (mother wavelet) coefficients, which doubles the number of feature columns. Model parameters are adjusted to conform to this.
# Wavelet trials: the DWT doubles the feature count (cA + cD per column),
# so input_dim becomes 32.  sequence_length is doubled to 100 — presumably so
# the half-length DWT output still matches the model's input timesteps;
# TODO confirm against the DataLoader implementation.
# Trial: standard scaling + Haar wavelet split.
config['normalize'] = True
config['model_name'] = 'Model_StdWT'
config['input_dim'] = 32
config['sequence_length'] = 100
TrainTestPlot(config, columns, "STD_WT")
# Trial: min-max [0, 1] scaling + Haar wavelet split.
config['normalize'] = True
config['model_name'] = 'Model_MinMax1_WT'
config['input_dim'] = 32
config['sequence_length'] = 100
TrainTestPlot(config, columns, "MINMAX1_WT")
# Trial: min-max [0, 3] scaling + Haar wavelet split.
config['normalize'] = True
config['model_name'] = 'Model_MinMax3_WT'
config['input_dim'] = 32
config['sequence_length'] = 100
TrainTestPlot(config, columns, "MINMAX3_WT")
# Trial: min-max [1, 5] scaling + Haar wavelet split.
config['normalize'] = True
config['model_name'] = 'Model_MinMax1_5_WT'
config['input_dim'] = 32
config['sequence_length'] = 100
TrainTestPlot(config, columns, "MINMAX1_5_WT")